• PCA looks to find a low-dimensional representation of the observations that explain a good fraction of the variance; • Clustering looks to find homogeneous subgroups among the observations.

K-means clustering is a simple and elegant approach for partitioning a data set into K distinct, non-overlapping clusters. To perform K-means clustering, we must first specify the desired number of clusters K; then the K-means algorithm will assign each observation to exactly one of the K clusters. Figure 10.5 shows the results obtained from performing K-means clustering on a simulated example consisting of 150 observations in two dimensions, using three different values of K.

source("setup.R")
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 3.5.3
## Loading required package: gridExtra
## Warning: package 'gridExtra' was built under R version 3.5.3
## Loading required package: plotly
## Warning: package 'plotly' was built under R version 3.5.3
## Loading required package: ggplot2
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Loading required package: ggmosaic
## Warning: package 'ggmosaic' was built under R version 3.5.3
## Loading required package: corrgram
## Warning: package 'corrgram' was built under R version 3.5.3
## Loading required package: party
## Warning: package 'party' was built under R version 3.5.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.5.2
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.5.2
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.1     v dplyr   0.7.6
## v readr   1.1.1     v stringr 1.3.1
## v tibble  1.4.2     v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x stringr::boundary() masks strucchange::boundary()
## x dplyr::combine()    masks gridExtra::combine()
## x dplyr::filter()     masks plotly::filter(), stats::filter()
## x dplyr::lag()        masks stats::lag()
## Loading required package: RColorBrewer
## Warning: package 'RColorBrewer' was built under R version 3.5.2
## `summarise_each()` is deprecated.
## Use `summarise_all()`, `summarise_at()` or `summarise_if()` instead.
## To map `funs` over all variables, use `summarise_all()`
## Using Class, record_id as id variables
t3.data.ssq.base <- filter(t3.data.ssq, redcap_event_name == "baseline_arm_1")

t3.data.ssq.base$ssq12_dk_complete <- NULL
t3.data.ssq.base$ssq_space_mean <- as.integer(t3.data.ssq.base$ssq_space_mean)
t3.data.ssq.base$ssq_speech_mean <- as.integer(t3.data.ssq.base$ssq_speech_mean)
t3.data.ssq.base$ssq_sound_mean <- as.integer(t3.data.ssq.base$ssq_sound_mean)
t3.data.ssq.base <- na.omit(t3.data.ssq.base)


t3.data.ssq.diff <- t3.data.ssq %>% filter(redcap_event_name == "baseline_arm_1") %>% select(record_id, IsDrawerUser, IsT1DrawerUser) %>% merge(ssq_diff, by="record_id")

K-means

SSQ Baseline

set.seed(20)
clusters <- kmeans(t3.data.ssq.base[,15:17], 3)

# Save the cluster number in the dataset as column 'Borough'
t3.data.ssq.base$Cluster <- as.factor(clusters$cluster)

# Sorting
sorted <- t3.data.ssq.base %>% group_by(Cluster) %>% summarise(mean = mean(c(ssq_speech_mean, ssq_space_mean, ssq_sound_mean)), n = n())

sorted_borough <- sorted[order(sorted$mean),]$Cluster

t3.data.ssq.base$Cluster <- factor(t3.data.ssq.base$Cluster, levels = sorted_borough, ordered = TRUE)

str(clusters)
## List of 9
##  $ cluster     : Named int [1:1298] 1 1 1 2 2 1 1 1 1 3 ...
##   ..- attr(*, "names")= chr [1:1298] "2" "3" "4" "6" ...
##  $ centers     : num [1:3, 1:3] 34.3 19 63.2 60.5 26.1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:3] "ssq_speech_mean" "ssq_space_mean" "ssq_sound_mean"
##  $ totss       : num 1848568
##  $ withinss    : num [1:3] 335939 151511 179622
##  $ tot.withinss: num 667072
##  $ betweenss   : num 1181496
##  $ size        : int [1:3] 647 293 358
##  $ iter        : int 3
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Writing
t3.data.ssq.base %>% group_by(Cluster) %>% 
  summarise(speech = mean(ssq_speech_mean), sound = mean(ssq_sound_mean), space = mean(ssq_space_mean) , n = n())
#plotting
ggplot(t3.data.ssq.base) + 
  geom_point(aes(x=ssq_speech_mean, y=ssq_space_mean, color=Cluster))

ggplot(t3.data.ssq.base) + 
  geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
  scale_fill_hue(l=60, c=60)

t3.data.ssq.base <- merge(t3.data.ssq.base, df_audiogram, by=c('record_id'))

ggplot(t3.data.ssq.base) + 
  geom_bar(aes(x=Cluster, fill=Class), position='fill', color="black") + 
  scale_fill_hue(l=80, c=80)

# Elbow method
k.max <- 15
data <- na.omit(t3.data.ssq.base[,15:17])

wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
##  [1] 3697136.2 1904148.9 1334143.5 1101252.3  946272.6  829610.7  750970.5
##  [8]  694473.6  643674.8  601272.0  563326.3  533142.5  503068.6  477330.0
## [15]  457858.6
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

Makes everything lag but very 3 dimensionally

plot_ly(t3.data.ssq.base, x = ~ssq_space_mean, y = ~ssq_speech_mean, z = ~ssq_sound_mean, color = ~Cluster) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'space'),
                     yaxis = list(title = 'speech'),
                     zaxis = list(title = 'sound')))

ssq digg

t3.data.ssq.diff <- t3.data.ssq %>% filter(redcap_event_name == "baseline_arm_1") %>% select(record_id, IsDrawerUser, IsT1DrawerUser) %>% merge(ssq_diff, by="record_id")

t3.data.ssq.diff <- t3.data.ssq.diff %>% filter(!is.na(ssq_speech_mean) & !is.na(ssq_space_mean) & !is.na(ssq_sound_mean))

set.seed(20)

clusters <- kmeans(t3.data.ssq.diff[,18:20], 5)

# Save the cluster number in the dataset as column 'Borough'
t3.data.ssq.diff$Cluster <- as.factor(clusters$cluster)

# Sorting
sorted <- t3.data.ssq.diff %>% group_by(Cluster) %>% summarise(mean = mean(c(ssq_speech_mean, ssq_space_mean, ssq_sound_mean)), n = n())

sorted_borough <- sorted[order(sorted$mean),]$Cluster

t3.data.ssq.diff$Cluster <- factor(t3.data.ssq.diff$Cluster, levels = sorted_borough, ordered = TRUE)

str(clusters)
## List of 9
##  $ cluster     : int [1:1685] 5 5 1 5 1 5 3 2 5 4 ...
##  $ centers     : num [1:5, 1:3] -7.57 14.27 49.07 33.97 2.76 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:5] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:3] "ssq_speech_mean" "ssq_space_mean" "ssq_sound_mean"
##  $ totss       : num 1911946
##  $ withinss    : num [1:5] 113325 122724 117586 122586 189591
##  $ tot.withinss: num 665812
##  $ betweenss   : num 1246134
##  $ size        : int [1:5] 182 319 151 275 758
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Writing
t3.data.ssq.diff %>% group_by(Cluster) %>% 
  summarise(speech = mean(ssq_speech_mean), sound = mean(ssq_sound_mean), space = mean(ssq_space_mean) , n = n())
#plotting
ggplot(t3.data.ssq.diff) + 
  geom_point(aes(x=ssq_speech_mean, y=ssq_space_mean, color=Cluster))

ggplot(t3.data.ssq.diff) + 
  geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
  scale_fill_hue(l=60, c=60)

t3.data.ssq.diff <- merge(t3.data.ssq.diff, df_audiogram, by=c('record_id'))

ggplot(t3.data.ssq.diff) + 
  geom_bar(aes(x=Cluster, fill=Class), position='fill', color="black") + 
  scale_fill_hue(l=80, c=80)

# Elbow method
k.max <- 15
data <- na.omit(t3.data.ssq.diff[,18:20])

wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
##  [1] 3822060.5 2150212.5 1720055.8 1496068.4 1329222.6 1194539.3 1089503.7
##  [8] 1019190.7  954276.5  892790.4  840477.5  792655.8  751667.4  718613.9
## [15]  687978.9
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

plot_ly(t3.data.ssq.diff, x = ~ssq_space_mean, y = ~ssq_speech_mean, z = ~ssq_sound_mean, color = ~Cluster) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'space'),
                     yaxis = list(title = 'speech'),
                     zaxis = list(title = 'sound')))
ggplot(t3.data.ssq.diff) + 
  geom_bar(aes(x=Cluster, fill=IsT1DrawerUser, color=IsDrawerUser), position='fill')+
  scale_fill_hue(l=60, c=60)

t3.data.ssq.diff 

Motivation

set.seed(20)

#elbow
k.max <- 15
data <- na.omit(df_motivation[,5:6])

wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss})
wss
##  [1] 248931.48 113535.94  80748.37  60625.13  49358.53  39878.65  31730.11
##  [8]  24878.51  21389.23  19099.97  16941.48  15498.69  14363.73  12497.27
## [15]  11626.96
plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")

clusters <- kmeans(na.omit(df_motivation[,5:6]), 4)

# Save the cluster number in the dataset as column 'Borough'
df_motivation.k <- na.omit(df_motivation)
df_motivation.k$Cluster <- as.factor(clusters$cluster)

str(clusters)
## List of 9
##  $ cluster     : Named int [1:504] 4 4 4 4 4 4 3 4 4 1 ...
##   ..- attr(*, "names")= chr [1:504] "3" "4" "6" "8" ...
##  $ centers     : num [1:4, 1:2] 75 43.9 91.2 96.9 84.3 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:2] "motivation_line_1_ver2" "motivation_line_2_ver2"
##  $ totss       : num 248931
##  $ withinss    : num [1:4] 20066 16644 10286 13671
##  $ tot.withinss: num 60666
##  $ betweenss   : num 188266
##  $ size        : int [1:4] 106 28 38 332
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
ggplot(df_motivation.k) + 
  geom_bin2d(aes(x=motivation_line_1_ver2, y=motivation_line_2_ver2, color=Cluster))

PCA

-```{r} t3.ssq.base.pca <- prcomp(t3.data.ssq.base[,c(3:14)], center = TRUE,scale. = TRUE) summary(t3.ssq.base.pca) plot(t3.ssq.base.pca)

#elbow k.max <- 15 data <- na.omit(t3.ssq.base.pca$x[,1:3])

wss <- sapply(1:k.max, function(k){kmeans(data, k, nstart=50,iter.max = 15 )$tot.withinss}) wss

plot(1:k.max, wss, type=“b”, pch = 19, frame = FALSE, xlab=“Number of clusters K”, ylab=“Total within-clusters sum of squares”)

clusters <- kmeans(na.omit(t3.ssq.base.pca$x[,1:3]), 3)

bla <- cbind(na.omit(t3.data.ssq.base[,3:14]), t3.ssq.base.pca$rotation) ```